{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Unsupervised anomaly detection with One-Class SVM\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_ip</th>\n",
       "      <th>username</th>\n",
       "      <th>success</th>\n",
       "      <th>failure_reason</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>datetime</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2018-01-01 00:06:19.353126</th>\n",
       "      <td>223.178.55.3</td>\n",
       "      <td>djones</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 00:09:07.147971</th>\n",
       "      <td>223.178.55.3</td>\n",
       "      <td>djones</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 01:08:08.610041</th>\n",
       "      <td>6.252.142.27</td>\n",
       "      <td>asmith</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 02:37:50.329298</th>\n",
       "      <td>124.178.25.98</td>\n",
       "      <td>akim</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 02:45:20.382080</th>\n",
       "      <td>98.43.141.103</td>\n",
       "      <td>akim</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                source_ip username  success failure_reason\n",
       "datetime                                                                  \n",
       "2018-01-01 00:06:19.353126   223.178.55.3   djones        1           None\n",
       "2018-01-01 00:09:07.147971   223.178.55.3   djones        1           None\n",
       "2018-01-01 01:08:08.610041   6.252.142.27   asmith        1           None\n",
       "2018-01-01 02:37:50.329298  124.178.25.98     akim        1           None\n",
       "2018-01-01 02:45:20.382080  98.43.141.103     akim        1           None"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import sqlite3\n",
    "\n",
    "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n",
    "    logs_2018 = pd.read_sql(\n",
    "        \"\"\"\n",
    "        SELECT * \n",
    "        FROM logs \n",
    "        WHERE datetime BETWEEN \"2018-01-01\" AND \"2019-01-01\";\n",
    "        \"\"\", \n",
    "        conn, parse_dates=['datetime'], index_col='datetime'\n",
    "    )\n",
    "logs_2018.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_X(log, day):\n",
    "    \"\"\"\n",
    "    Get data we can use for the X\n",
    "    \n",
    "    Parameters:\n",
    "        - log: The logs dataframe\n",
    "        - day: A day or single value we can use as a datetime index slice\n",
    "    \n",
    "    Returns: \n",
    "        A pandas DataFrame\n",
    "    \"\"\"\n",
    "    return pd.get_dummies(log[day].assign(\n",
    "        failures=lambda x:  1 - x.success\n",
    "    ).query('failures > 0').resample('1min').agg(\n",
    "        {'username':'nunique', 'failures': 'sum'}\n",
    "    ).dropna().rename(\n",
    "        columns={'username':'usernames_with_failures'}\n",
    "    ).assign(\n",
    "        day_of_week=lambda x: x.index.dayofweek, \n",
    "        hour=lambda x: x.index.hour\n",
    "    ).drop(columns=['failures']), columns=['day_of_week', 'hour'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',\n",
       "       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',\n",
       "       'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',\n",
       "       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',\n",
       "       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',\n",
       "       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = get_X(logs_2018, '2018-01')\n",
    "X.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-class SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\preprocessing\\data.py:645: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
      "  return self.partial_fit(X, y)\n",
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\base.py:464: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
      "  return self.fit(X, **fit_params).transform(X)\n",
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\svm\\classes.py:1177: DeprecationWarning: The random_state parameter is deprecated and will be removed in version 0.22.\n",
      "  \" be removed in version 0.22.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import OneClassSVM\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "one_class_svm_pipeline = Pipeline([\n",
    "    ('scale', StandardScaler()),\n",
    "    ('svm', OneClassSVM(random_state=0))\n",
    "]).fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
      "  Xt = transform.transform(Xt)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "outlier    22823\n",
       "inlier     18794\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds = one_class_svm_pipeline.predict(X)\n",
    "pd.Series(np.where(preds == -1, 'outlier', 'inlier')).value_counts()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
